This report provides an exploratory data analysis (EDA) of the processed Heart Disease Training dataset.
# Load train_scaled.csv from the processed-data directory. With
# stringsAsFactors = TRUE every character column is read in as a factor.
train <- read.csv("../data/processed/train_scaled.csv", stringsAsFactors = TRUE)
# Print a quick overview of a data frame.
#
# Output, in order: dimensions, str(), summary(), the result of
# check_missing(df) and the result of class_balance(df).
#
# Args:
#   df:         data frame to describe.
#   target_col: name of the target column.
#               NOTE(review): this argument is never read in the body;
#               class_balance() is called with `df` only. Confirm whether it
#               was meant to be forwarded.
print_basic_info <- function(df, target_col) {
  # Shape and column types
  cat("\nData overview\n")
  print(dim(df))
  str(df)

  # Per-column summary statistics
  cat("\n\nSummary:\n")
  column_summary <- summary(df)
  print(column_summary)

  # Helper defined outside this block
  cat("\nMissing values:\n")
  missing_report <- check_missing(df)
  print(missing_report)

  # Helper defined outside this block
  cat("\nTarget class balance:\n")
  balance_report <- class_balance(df)
  print(balance_report)
}
# Print the overview of the training data; "HeartDisease" is passed as the
# target column name.
print_basic_info(train, "HeartDisease")
##
## Data overview
## [1] 550 12
## 'data.frame': 550 obs. of 12 variables:
## $ Age : num 0.0757 0.6163 -1.7625 -0.8975 -1.33 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 2 2 1 ...
## $ ChestPainType : Factor w/ 4 levels "ASY","ATA","NAP",..: 1 1 3 3 2 1 1 1 2 2 ...
## $ RestingBP : num -0.14 -0.563 -0.14 -0.14 -0.405 ...
## $ Cholesterol : num -1.8373 0.3152 -0.0451 0.3429 -0.1375 ...
## $ FastingBS : num 1.735 -0.575 -0.575 -0.575 -0.575 ...
## $ RestingECG : Factor w/ 3 levels "LVH","Normal",..: 2 2 2 2 2 1 3 2 2 2 ...
## $ MaxHR : num -1.113 -0.83 0.503 0.261 1.715 ...
## $ ExerciseAngina: Factor w/ 2 levels "N","Y": 2 2 1 1 1 2 2 1 1 1 ...
## $ Oldpeak : num 1.946 0.36 -0.853 -0.76 -0.853 ...
## $ ST_Slope : Factor w/ 3 levels "Down","Flat",..: 2 1 3 3 3 2 2 2 3 3 ...
## $ HeartDisease : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 2 2 2 1 1 ...
##
##
## Summary:
## Age Sex ChestPainType RestingBP Cholesterol
## Min. :-2.73561 F:126 ASY:289 Min. :-7.0182 Min. :-1.8373
## 1st Qu.:-0.68120 M:424 ATA:112 1st Qu.:-0.6692 1st Qu.:-0.2021
## Median : 0.07569 NAP:122 Median :-0.1402 Median : 0.2043
## Mean : 0.00000 TA : 27 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.72445 3rd Qu.: 0.3889 3rd Qu.: 0.6270
## Max. : 2.56261 Max. : 3.5634 Max. : 3.3730
## FastingBS RestingECG MaxHR ExerciseAngina
## Min. :-0.5754 LVH :113 Min. :-3.13256 N:333
## 1st Qu.:-0.5754 Normal:337 1st Qu.:-0.70872 Y:217
## Median :-0.5754 ST :100 Median : 0.09923
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.:-0.5754 3rd Qu.: 0.73549
## Max. : 1.7347 Max. : 2.60387
## Oldpeak ST_Slope HeartDisease
## Min. :-2.2531 Down: 37 No :248
## 1st Qu.:-0.8533 Flat:282 Yes:302
## Median :-0.2467 Up :231
## Mean : 0.0000
## 3rd Qu.: 0.5465
## Max. : 4.9325
##
## Missing values:
## Age Sex ChestPainType RestingBP Cholesterol
## 0 0 0 0 0
## FastingBS RestingECG MaxHR ExerciseAngina Oldpeak
## 0 0 0 0 0
## ST_Slope HeartDisease
## 0 0
##
## Target class balance:
##
## No Yes
## 248 302
# Columns handled as numeric features by the plotting loops and the
# correlation matrix.
# NOTE(review): the summary() output for FastingBS shows the same value from
# the minimum through the 3rd quartile and one other value as the maximum, so
# it looks like a scaled binary indicator -- confirm that treating it as a
# continuous feature is intended.
numeric_cols <- c("Age", "RestingBP", "Cholesterol",
"FastingBS", "MaxHR", "Oldpeak")
# Columns handled as categorical (factor) features by the bar-chart loops.
cat_cols <- c("Sex", "ChestPainType", "RestingECG",
"ExerciseAngina", "ST_Slope")
##############################################################
# T-TEST
##############################################################
# Welch (unequal-variance) two-sample t-test with a printed report.
#
# Computes the t statistic, the Welch-Satterthwaite degrees of freedom and the
# two-sided p-value by hand, then prints the hypotheses, the test results, the
# group means and a decision at the 0.05 level.
#
# Args:
#   feature_name: label used in the printed report.
#   group_no:     values of the feature for the "no heart disease" group.
#   group_yes:    values of the feature for the "heart disease" group.
#
# Returns:
#   Invisibly, a list with elements `t_value`, `df` and `p_value`.
#
# Errors:
#   Stops with a clear message when a group has fewer than two observations,
#   contains missing values, or when both groups have zero variance (the
#   statistic is undefined in all three cases).
t_test <- function(feature_name, group_no, group_yes) {
  n_no <- length(group_no)
  n_yes <- length(group_yes)

  if (n_no < 2 || n_yes < 2) {
    stop("t_test() needs at least two observations in each group.",
         call. = FALSE)
  }
  if (anyNA(group_no) || anyNA(group_yes)) {
    stop("t_test() does not accept missing values; remove them first.",
         call. = FALSE)
  }

  mean_no <- mean(group_no)
  mean_yes <- mean(group_yes)
  variance_no <- var(group_no)
  variance_yes <- var(group_yes)

  # Squared standard error of the difference in means
  se_squared <- variance_no / n_no + variance_yes / n_yes
  if (!(se_squared > 0)) {
    stop("t_test() is undefined when both groups have zero variance.",
         call. = FALSE)
  }

  # t-value
  t_value <- (mean_no - mean_yes) / sqrt(se_squared)

  # Welch-Satterthwaite degrees of freedom
  df <- se_squared^2 /
    ((variance_no^2) / (n_no^2 * (n_no - 1)) +
       (variance_yes^2) / (n_yes^2 * (n_yes - 1)))

  # Two-sided p-value. The upper tail is requested directly: 1 - pt(...)
  # cancels to exactly 0 once the tail drops below machine precision, which is
  # why strong effects used to be reported as "0e+00".
  p_value <- 2 * pt(abs(t_value), df, lower.tail = FALSE)

  # print all the conclusion
  cat("\n----------------------------------------------------------------\n")
  cat("Feature:", feature_name, "\n\n")

  # Hypotheses
  cat("H0(Null Hypothesis): ")
  cat("People with and without heart disease have similar", feature_name, ".\n\n")
  cat("Ha(Alternative Hypothesis): ")
  cat("The two groups do NOT have similar", feature_name, ".\n\n")

  # Test values
  cat("Test Results:\n")
  cat(" t-value:", round(t_value, 4), "\n")
  cat(" p-value:", format(p_value, scientific = TRUE), "\n\n")
  cat("Group Averages:\n")
  cat(" No heart disease:", round(mean_no, 4), "\n")
  cat(" Yes heart disease:", round(mean_yes, 4), "\n\n")

  # Decision
  if (p_value < 0.05) {
    cat("Decision:")
    cat(" The test is significant, so We reject the null hypothesis.\n\n")
    if (mean_yes > mean_no) {
      cat("Conclusion:")
      cat(" People with heart disease tend to have higher", feature_name, ".\n")
      cat(" This feature seems related to heart disease.\n")
    } else {
      cat("Conclusion:")
      cat(" People without heart disease tend to have higher", feature_name, ".\n")
      cat(" The feature still differs between groups.\n")
    }
  } else {
    cat("Decision:")
    cat(" The test is not significant, so We do not reject the null hypothesis.\n\n")
    cat("Conclusion:")
    cat(" This feature does not show a meaningful difference between the groups.\n")
  }

  invisible(list(t_value = t_value, df = df, p_value = p_value))
}
##############################################################
# CHI-SQUARE TEST
##############################################################
# Pearson chi-square test of independence with a printed report.
#
# Cross-tabulates a categorical feature against the target, computes expected
# counts from the margins, the chi-square statistic (no continuity correction
# is applied), the degrees of freedom and the upper-tail p-value, then prints
# the hypotheses, the test results and a decision at the 0.05 level.
#
# Args:
#   feature_name:   label used in the printed report.
#   feature_vector: categorical feature values.
#   target_vector:  target values, same length as `feature_vector`.
#
# Returns:
#   Invisibly, a list with elements `chi_square_value`, `df` and `p_value`.
#
# Errors:
#   Stops when fewer than two non-empty categories remain on either side,
#   because the test is undefined for such a table.
chi_square <- function(feature_name, feature_vector, target_vector) {
  # Create contingency table - Stat Course
  tbl <- table(feature_vector, target_vector)
  observed <- as.matrix(tbl)

  # Unused factor levels produce all-zero rows/columns. Their expected counts
  # are 0, so they would turn the statistic into NaN (0 / 0); drop them.
  observed <- observed[rowSums(observed) > 0, colSums(observed) > 0,
                       drop = FALSE]
  if (nrow(observed) < 2 || ncol(observed) < 2) {
    stop("chi_square() needs at least two non-empty categories in both ",
         "the feature and the target.", call. = FALSE)
  }

  # Row totals, column totals, grand total
  row_totals <- rowSums(observed)
  col_totals <- colSums(observed)
  grand_total <- sum(observed)

  # Expected counts (row total * col total / grand total)
  expected <- outer(row_totals, col_totals) / grand_total

  # Chi-square statistic
  chi_square_value <- sum((observed - expected)^2 / expected)

  # Degrees of freedom
  df <- (nrow(observed) - 1) * (ncol(observed) - 1)

  # Upper-tail p-value. Requested directly because 1 - pchisq(...) cancels to
  # exactly 0 for large statistics, which used to be reported as "0e+00".
  p_value <- pchisq(chi_square_value, df, lower.tail = FALSE)

  # Print all conclusions
  cat("\n----------------------------------------------------------------\n")
  cat("Feature:", feature_name, "\n\n")

  # Hypotheses
  cat("H0(Null Hypothesis): ")
  cat("People with and without heart disease have similar", feature_name, "distribution.\n\n")
  cat("Ha(Alternative Hypothesis): ")
  cat("The two groups do NOT have similar", feature_name, "distribution.\n\n")

  # Test values
  cat("Test Results:\n")
  cat(" Chi-square value:", round(chi_square_value, 4), "\n")
  cat(" p-value:", format(p_value, scientific = TRUE), "\n")
  cat(" Degrees of freedom:", df, "\n\n")

  # Decision
  if (p_value < 0.05) {
    cat("Decision:")
    cat(" The test is significant, so we reject the null hypothesis.\n\n")
    cat("Conclusion:")
    cat(" This feature seems related to heart disease.\n")
  } else {
    cat("Decision:")
    cat(" The test is not significant, so we do not reject the null hypothesis.\n\n")
    cat("Conclusion:")
    cat(" This feature does not show a meaningful difference between the groups.\n")
  }

  invisible(list(chi_square_value = chi_square_value, df = df,
                 p_value = p_value))
}
############################################################
# Run the T-Tests for Numeric Features
############################################################
# Read the training data again and list the numeric features to test.
train <- read.csv(
  file = "../data/processed/train_scaled.csv",
  stringsAsFactors = TRUE
)
numeric_cols <- c(
  "Age", "RestingBP", "Cholesterol",
  "FastingBS", "MaxHR", "Oldpeak"
)

# For every feature: split its values by HeartDisease level ("No" / "Yes")
# and print the t-test report.
for (col in numeric_cols) {
  group_no <- train[train$HeartDisease == "No", col]
  group_yes <- train[train$HeartDisease == "Yes", col]
  t_test(feature_name = col, group_no = group_no, group_yes = group_yes)
}
##
## ----------------------------------------------------------------
## Feature: Age
##
## H0(Null Hypothesis): People with and without heart disease have similar Age .
##
## Ha(Alternative Hypothesis): The two groups do NOT have similar Age .
##
## Test Results:
## t-value: -7.1309
## p-value: 3.511635e-12
##
## Group Averages:
## No heart disease: -0.3246
## Yes heart disease: 0.2665
##
## Decision: The test is significant, so We reject the null hypothesis.
##
## Conclusion: People with heart disease tend to have higher Age .
## This feature seems related to heart disease.
##
## ----------------------------------------------------------------
## Feature: RestingBP
##
## H0(Null Hypothesis): People with and without heart disease have similar RestingBP .
##
## Ha(Alternative Hypothesis): The two groups do NOT have similar RestingBP .
##
## Test Results:
## t-value: -3.0553
## p-value: 2.357454e-03
##
## Group Averages:
## No heart disease: -0.1397
## Yes heart disease: 0.1147
##
## Decision: The test is significant, so We reject the null hypothesis.
##
## Conclusion: People with heart disease tend to have higher RestingBP .
## This feature seems related to heart disease.
##
## ----------------------------------------------------------------
## Feature: Cholesterol
##
## H0(Null Hypothesis): People with and without heart disease have similar Cholesterol .
##
## Ha(Alternative Hypothesis): The two groups do NOT have similar Cholesterol .
##
## Test Results:
## t-value: 4.9079
## p-value: 1.243569e-06
##
## Group Averages:
## No heart disease: 0.2165
## Yes heart disease: -0.1778
##
## Decision: The test is significant, so We reject the null hypothesis.
##
## Conclusion: People without heart disease tend to have higher Cholesterol .
## The feature still differs between groups.
##
## ----------------------------------------------------------------
## Feature: FastingBS
##
## H0(Null Hypothesis): People with and without heart disease have similar FastingBS .
##
## Ha(Alternative Hypothesis): The two groups do NOT have similar FastingBS .
##
## Test Results:
## t-value: -6.0568
## p-value: 2.608917e-09
##
## Group Averages:
## No heart disease: -0.268
## Yes heart disease: 0.2201
##
## Decision: The test is significant, so We reject the null hypothesis.
##
## Conclusion: People with heart disease tend to have higher FastingBS .
## This feature seems related to heart disease.
##
## ----------------------------------------------------------------
## Feature: MaxHR
##
## H0(Null Hypothesis): People with and without heart disease have similar MaxHR .
##
## Ha(Alternative Hypothesis): The two groups do NOT have similar MaxHR .
##
## Test Results:
## t-value: 9.6449
## p-value: 0e+00
##
## Group Averages:
## No heart disease: 0.4203
## Yes heart disease: -0.3451
##
## Decision: The test is significant, so We reject the null hypothesis.
##
## Conclusion: People without heart disease tend to have higher MaxHR .
## The feature still differs between groups.
##
## ----------------------------------------------------------------
## Feature: Oldpeak
##
## H0(Null Hypothesis): People with and without heart disease have similar Oldpeak .
##
## Ha(Alternative Hypothesis): The two groups do NOT have similar Oldpeak .
##
## Test Results:
## t-value: -11.4541
## p-value: 0e+00
##
## Group Averages:
## No heart disease: -0.4653
## Yes heart disease: 0.3821
##
## Decision: The test is significant, so We reject the null hypothesis.
##
## Conclusion: People with heart disease tend to have higher Oldpeak .
## This feature seems related to heart disease.
# Banner printed ahead of the chi-square reports for the categorical features.
cat("\n******* Chi-Square Tests for Categorical Features *******\n")
##
## ******* Chi-Square Tests for Categorical Features *******
############################################################
# Run Chi-Square Tests for Each Categorical Feature
############################################################
# Read the training data again and list the categorical features to test.
train <- read.csv(
  file = "../data/processed/train_scaled.csv",
  stringsAsFactors = TRUE
)
categorical_cols <- c(
  "Sex", "ChestPainType", "RestingECG",
  "ExerciseAngina", "ST_Slope"
)

# Test each feature against the HeartDisease column and print the report.
for (col in categorical_cols) {
  chi_square(
    feature_name = col,
    feature_vector = train[[col]],
    target_vector = train[["HeartDisease"]]
  )
}
##
## ----------------------------------------------------------------
## Feature: Sex
##
## H0(Null Hypothesis): People with and without heart disease have similar Sex distribution.
##
## Ha(Alternative Hypothesis): The two groups do NOT have similar Sex distribution.
##
## Test Results:
## Chi-square value: 60.6302
## p-value: 6.883383e-15
## Degrees of freedom: 1
##
## Decision: The test is significant, so we reject the null hypothesis.
##
## Conclusion: This feature seems related to heart disease.
##
## ----------------------------------------------------------------
## Feature: ChestPainType
##
## H0(Null Hypothesis): People with and without heart disease have similar ChestPainType distribution.
##
## Ha(Alternative Hypothesis): The two groups do NOT have similar ChestPainType distribution.
##
## Test Results:
## Chi-square value: 196.1064
## p-value: 0e+00
## Degrees of freedom: 3
##
## Decision: The test is significant, so we reject the null hypothesis.
##
## Conclusion: This feature seems related to heart disease.
##
## ----------------------------------------------------------------
## Feature: RestingECG
##
## H0(Null Hypothesis): People with and without heart disease have similar RestingECG distribution.
##
## Ha(Alternative Hypothesis): The two groups do NOT have similar RestingECG distribution.
##
## Test Results:
## Chi-square value: 5.9355
## p-value: 5.141916e-02
## Degrees of freedom: 2
##
## Decision: The test is not significant, so we do not reject the null hypothesis.
##
## Conclusion: This feature does not show a meaningful difference between the groups.
##
## ----------------------------------------------------------------
## Feature: ExerciseAngina
##
## H0(Null Hypothesis): People with and without heart disease have similar ExerciseAngina distribution.
##
## Ha(Alternative Hypothesis): The two groups do NOT have similar ExerciseAngina distribution.
##
## Test Results:
## Chi-square value: 110.1068
## p-value: 0e+00
## Degrees of freedom: 1
##
## Decision: The test is significant, so we reject the null hypothesis.
##
## Conclusion: This feature seems related to heart disease.
##
## ----------------------------------------------------------------
## Feature: ST_Slope
##
## H0(Null Hypothesis): People with and without heart disease have similar ST_Slope distribution.
##
## Ha(Alternative Hypothesis): The two groups do NOT have similar ST_Slope distribution.
##
## Test Results:
## Chi-square value: 206.9801
## p-value: 0e+00
## Degrees of freedom: 2
##
## Decision: The test is significant, so we reject the null hypothesis.
##
## Conclusion: This feature seems related to heart disease.
# Count the rows per HeartDisease level, convert the counts to proportions and
# build a "<level> (<percent>)" label for each slice.
pie_data <- train %>%
  count(HeartDisease) %>%
  mutate(prop = n / sum(n)) %>%
  mutate(lbl = paste0(HeartDisease, " (", scales::percent(prop), ")"))

# One stacked bar drawn in polar coordinates gives the pie; the labels are
# placed at the middle of each stacked segment.
ggplot(pie_data, aes(x = "", y = prop, fill = HeartDisease)) +
  geom_col(width = 1, color = "white") +
  geom_text(aes(label = lbl), position = position_stack(vjust = 0.5)) +
  coord_polar(theta = "y") +
  labs(title = "HeartDisease Class Distribution (Pie Chart)") +
  theme_void()
# Histogram (30 bins) of every numeric feature.
# `.data[[col]]` looks the column up by its string name inside aes(); it
# replaces aes_string(), which is deprecated since ggplot2 3.0.0. The x label
# is set explicitly so the axis still shows the plain column name.
for (col in numeric_cols) {
  print(
    ggplot(train, aes(x = .data[[col]])) +
      geom_histogram(bins = 30, fill = "skyblue", color = "black") +
      theme_minimal() +
      labs(title = paste("Distribution of", col), x = col)
  )
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Bar chart of the level counts of every categorical feature.
# `.data[[col]]` looks the column up by its string name inside aes(); it
# replaces aes_string(), which is deprecated since ggplot2 3.0.0. The x label
# is set explicitly so the axis still shows the plain column name.
for (col in cat_cols) {
  print(
    ggplot(train, aes(x = .data[[col]])) +
      geom_bar(fill = "orange", color = "black") +
      theme_minimal() +
      labs(title = paste("Distribution of", col), x = col)
  )
}
# Box plot of every numeric feature, split and filled by HeartDisease.
# aes() with a bare column name and `.data[[col]]` replaces aes_string(),
# which is deprecated since ggplot2 3.0.0. The y label is set explicitly so
# the axis still shows the plain column name.
for (col in numeric_cols) {
  print(
    ggplot(train, aes(x = HeartDisease, y = .data[[col]],
                      fill = HeartDisease)) +
      geom_boxplot() +
      theme_minimal() +
      labs(title = paste(col, "by HeartDisease"), y = col)
  )
}
# For every categorical feature: bars normalised to height 1
# (position = "fill"), showing the HeartDisease share within each level.
# aes() with `.data[[col]]` replaces aes_string(), which is deprecated since
# ggplot2 3.0.0. The x label is set explicitly so the axis still shows the
# plain column name.
for (col in cat_cols) {
  print(
    ggplot(train, aes(x = .data[[col]], fill = HeartDisease)) +
      geom_bar(position = "fill") +
      theme_minimal() +
      labs(title = paste(col, "vs HeartDisease (Proportion)"),
           x = col,
           y = "Proportion")
  )
}
# Correlation matrix of the numeric features (cor() with its default
# settings), drawn as a colour-coded upper triangle with the coefficients
# printed in black.
numeric_data <- train[, numeric_cols]
cor_mat <- cor(x = numeric_data)
corrplot(
  cor_mat,
  type = "upper",
  method = "color",
  addCoef.col = "black",
  number.cex = 0.7,
  tl.col = "black",
  tl.cex = 0.8
)
# Scatter plots of Age against selected numeric features, coloured by
# HeartDisease.
# aes() with bare column names and `.data[[col]]` replaces aes_string(),
# which is deprecated since ggplot2 3.0.0.
key_pairs <- c("RestingBP", "Cholesterol", "MaxHR", "Oldpeak")
for (col in key_pairs) {
  print(
    ggplot(train, aes(x = Age, y = .data[[col]], color = HeartDisease)) +
      geom_point(alpha = 0.6, size = 2) +
      theme_minimal() +
      labs(
        title = paste("Scatter Plot:", "Age vs", col),
        x = "Age",
        y = col
      )
  )
}
# Age vs MaxHR with one loess trend line (and confidence band) per
# HeartDisease group. The formula is spelled out so geom_smooth() does not
# emit its "using formula = 'y ~ x'" message; the fitted curve is unchanged.
ggplot(train, aes(x = Age, y = MaxHR, color = HeartDisease)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", formula = y ~ x, se = TRUE) +
  theme_minimal() +
  labs(title = "Age vs MaxHR with Trend Line")
## `geom_smooth()` using formula = 'y ~ x'
# MaxHR vs Oldpeak, coloured by HeartDisease, with one panel per Sex level.
ggplot(
  data = train,
  mapping = aes(x = MaxHR, y = Oldpeak, colour = HeartDisease)
) +
  geom_point(alpha = 0.6) +
  facet_wrap(facets = ~ Sex) +
  labs(title = "MaxHR vs Oldpeak (Faceted by Sex)") +
  theme_minimal()
# Pairs plot of the numeric features plus the target, coloured by
# HeartDisease.
# The numeric-vs-factor panels in the lower triangle are faceted histograms;
# passing bins = 30 explicitly keeps the same binning while silencing the
# repeated "`stat_bin()` using `bins = 30`" messages.
ggpairs(
  train[, c(numeric_cols, "HeartDisease")],
  aes(color = HeartDisease, alpha = 0.5),
  lower = list(combo = wrap("facethist", bins = 30))
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.